{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment Analysis\n",
    "We will use the IMDB sentiment analysis database in this tutorial. The main idea that is used in this tutorial is that certain words are enough to establish the sentiment of a given sentence. The word order is discarded in this particular tutorial.\n",
    "<img src=\"../images/angry_happy_dogo.png\" alt=\"dogo\" style=\"width: 500px;\"/>\n",
    "\n",
    "## References\n",
    "1. http://ai.stanford.edu/~amaas/data/sentiment/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/TIRsS6ktXqA?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/TIRsS6ktXqA?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import urllib\n",
    "import urllib.request\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "np.random.seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Reviews</th>\n",
       "      <th>Sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Bromwell High is a cartoon comedy. It ran at t...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Homelessness (or Houselessness as George Carli...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brilliant over-acting by Lesley Ann Warren. Be...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>This is easily the most underrated film inn th...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>This is not the typical Mel Brooks film. It wa...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             Reviews  Sentiment\n",
       "0  Bromwell High is a cartoon comedy. It ran at t...          1\n",
       "1  Homelessness (or Houselessness as George Carli...          1\n",
       "2  Brilliant over-acting by Lesley Ann Warren. Be...          1\n",
       "3  This is easily the most underrated film inn th...          1\n",
       "4  This is not the typical Mel Brooks film. It wa...          1"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Location of the pickled IMDB reviews and where the local copy is cached.\n",
    "REVIEWS_URL = 'https://www.dropbox.com/s/15tfttuzqe7fimg/reviews2.pkl?dl=1'\n",
    "REVIEWS_PATH = os.path.join('data', 'reviews2.pkl')\n",
    "\n",
    "if not os.path.isfile(REVIEWS_PATH):\n",
    "    # urlretrieve does not create missing directories, so make sure data/ exists first.\n",
    "    os.makedirs(os.path.dirname(REVIEWS_PATH), exist_ok=True)\n",
    "    urllib.request.urlretrieve(REVIEWS_URL, REVIEWS_PATH)\n",
    "\n",
    "# NOTE: unpickling can execute arbitrary code, so only load this file from a source you trust.\n",
    "df = pd.read_pickle(REVIEWS_PATH)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\". My 35 years in the teaching profession lead me to believe that Bromwell High\\'s satire is much closer to reality than is \"Teachers\". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\\'t!'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Reviews.values[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 2)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Reviews</th>\n",
       "      <th>Sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12495</th>\n",
       "      <td>Towards the end of the movie, I felt it was to...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12496</th>\n",
       "      <td>This is the kind of movie that my enemies cont...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12497</th>\n",
       "      <td>I saw 'Descent' last night at the Stockholm Fi...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12498</th>\n",
       "      <td>Some films that you pick up for a pound turn o...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12499</th>\n",
       "      <td>This is one of the dumbest films, I've ever se...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 Reviews  Sentiment\n",
       "12495  Towards the end of the movie, I felt it was to...          0\n",
       "12496  This is the kind of movie that my enemies cont...          0\n",
       "12497  I saw 'Descent' last night at the Stockholm Fi...          0\n",
       "12498  Some films that you pick up for a pound turn o...          0\n",
       "12499  This is one of the dumbest films, I've ever se...          0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the first model that we look at we simply count the number of words in each document. This method is called **Bag of words**. It's considered a bag since we lose all ordering of the words. The features are simply the count of each word. \n",
    "\n",
    "Luckily for us, scikit-learn provides this feature extractor as the `CountVectorizer` class. See http://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage for an example. The `fit_transform` method gives us the matrix that we are after."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tf_vectorizer = CountVectorizer()\n",
    "tf = tf_vectorizer.fit_transform(df.Reviews.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of unique words:  74849\n",
      "['00', '000', '0000000000001', '00001', '00015']\n"
     ]
    }
   ],
   "source": [
    "# vocabulary_ maps every term to its column index. CountVectorizer assigns those\n",
    "# indices in sorted term order, so sorting the terms lists them in column order.\n",
    "vocabulary = sorted(tf_vectorizer.vocabulary_)\n",
    "print('Number of unique words: ', len(vocabulary))\n",
    "print(vocabulary[:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Unfortunately this is extremely noisy, as can be seen from the presence of random numbers such as `00`. Hence we will do two additional things. First, we will only consider words that appear in at least 5 reviews. Second, words such as `the, a, to` are irrelevant for figuring out the sentiment, so we shall remove them. These are commonly known as _stop words_ in Natural Language Processing (NLP)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New number of unique words:  26967\n"
     ]
    }
   ],
   "source": [
    "# Drop English stop words and any term that appears in fewer than 5 reviews.\n",
    "tf_vectorizer = CountVectorizer(min_df=5, stop_words='english')\n",
    "tf = tf_vectorizer.fit_transform(df.Reviews.values)\n",
    "# vocabulary_ holds exactly the terms that survived the filtering.\n",
    "print('New number of unique words: ', len(tf_vectorizer.vocabulary_))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice that tf is a sparse matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'scipy.sparse.csr.csr_matrix'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(25000, 26967)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(type(tf))\n",
    "tf.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use `np.random.permutation` to shuffle the data. It is really important to shuffle the data before we put it into any machine learning algorithm. See below as to how `np.random.permutation` works."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 1, 4, 0, 3])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.permutation(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Shuffle the row indices once, then split them 50/50 into train and test sets.\n",
    "idx = np.random.permutation(len(df))\n",
    "n_train = len(df) // 2\n",
    "train_idx, test_idx = idx[:n_train], idx[n_train:]\n",
    "\n",
    "# Select only the rows each split needs before densifying; toarray() returns a\n",
    "# plain ndarray rather than the discouraged np.matrix.\n",
    "X_train = tf[train_idx].toarray()\n",
    "X_test = tf[test_idx].toarray()\n",
    "y_train = df.Sentiment.values[train_idx]\n",
    "y_test = df.Sentiment.values[test_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12500, 26967)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Keras Model\n",
    "Below we create our first deep learning model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation, Dropout\n",
    "from keras.regularizers import l2, l1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "dense_1 (Dense)              (None, 100)               2696800   \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 1)                 101       \n",
      "=================================================================\n",
      "Total params: 2,696,901\n",
      "Trainable params: 2,696,901\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# One hidden ReLU layer followed by a single sigmoid unit that outputs P(positive).\n",
    "model = Sequential()\n",
    "# The input width is taken from the matrix the model is actually trained on.\n",
    "model.add(Dense(units=100, activation='relu', input_dim=X_train.shape[1]))\n",
    "model.add(Dense(units=1, activation='sigmoid'))\n",
    "model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=[\"binary_accuracy\"])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/2\n",
      "12500/12500 [==============================] - 7s - loss: 0.3658 - binary_accuracy: 0.8470     \n",
      "Epoch 2/2\n",
      "12500/12500 [==============================] - 6s - loss: 0.1127 - binary_accuracy: 0.9664     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fcc32273eb8>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X_train, y_train, epochs=2, batch_size=128)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Keras has two ways of predicting: `model.predict(...)`, which returns the **probability** of belonging to a class (basically the value obtained after the final activation), and `model.predict_classes(...)`, which, as the name suggests, outputs the class. Here we will use the `predict` method.\n",
    "\n",
    "**This is the probability of the class being 1 (positive review in this case).**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_test_pred = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12500, 1)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[  8.21291283e-03],\n",
       "       [  1.78651214e-02],\n",
       "       [  2.26298347e-02],\n",
       "       ..., \n",
       "       [  9.89961624e-01],\n",
       "       [  9.61236179e-01],\n",
       "       [  2.32975523e-07]], dtype=float32)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(y_test_pred.shape)\n",
    "y_test_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us manually convert these probabilities to class labels (0 or 1) using NumPy arrays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.882"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Threshold the probabilities into 0/1 labels without overwriting y_test_pred.\n",
    "y_test_label = (y_test_pred >= 0.5).astype(int)\n",
    "# Flatten the (n, 1) predictions so they compare element-wise with y_test.\n",
    "np.count_nonzero(y_test_label.ravel() == y_test) / len(y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us try some manual test cases. Feel free to change the sentence below. It is really important to note that I am using `transform` and not `fit_transform` below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.38682193]], dtype=float32)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_case = tf_vectorizer.transform([\"I really hated this movie\"])\n",
    "model.predict(test_case.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 26967)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_case.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.67480838]], dtype=float32)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_case = tf_vectorizer.transform([\"I really loved this movie\"])\n",
    "model.predict(test_case.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[0, 0, 0, ..., 0, 0, 0]])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_case.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12500, 26967)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Improving the model\n",
    "\n",
    "Counting is one of the most basic things you could do when it comes to NLP applications. A more popular method that tends to be a bit better is the Term Frequency - Inverse Document Frequency (TF-IDF) method. Not only does it take frequency (term frequency) into account, but it also penalises words for being overused (for example the word 'the' would be downweighted since it is too common). The intuition is that a word has no discriminatory power if it is used in every document, whereas rare words might be able to distinguish the contents of a document better."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tfidf_vectorizer = TfidfVectorizer(min_df=5, stop_words='english')\n",
    "tfidf = tfidf_vectorizer.fit_transform(df.Reviews.values)\n",
    "\n",
    "# Reuse the earlier shuffle and split point so the rows stay aligned with y_train / y_test.\n",
    "n_train = len(y_train)\n",
    "X_train = tfidf[idx[:n_train]].toarray()\n",
    "X_test = tfidf[idx[n_train:]].toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12500, 26967)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2696800"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "26967*100 + 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "dense_5 (Dense)              (None, 1)                 26968     \n",
      "_________________________________________________________________\n",
      "activation_3 (Activation)    (None, 1)                 0         \n",
      "=================================================================\n",
      "Total params: 26,968\n",
      "Trainable params: 26,968\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# A single sigmoid unit applied directly to the TF-IDF features.\n",
    "model = Sequential()\n",
    "# Size the input from the TF-IDF training matrix, not from the count matrix `tf`.\n",
    "model.add(Dense(units=1, input_dim=X_train.shape[1]))\n",
    "model.add(Activation(\"sigmoid\"))\n",
    "model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=[\"binary_accuracy\"])\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/2\n",
      "12500/12500 [==============================] - 2s - loss: 0.6653 - binary_accuracy: 0.8119     \n",
      "Epoch 2/2\n",
      "12500/12500 [==============================] - 1s - loss: 0.6330 - binary_accuracy: 0.8794     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fcd7db849e8>"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X_train, y_train, epochs=2, batch_size=128)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12500/12500 [==============================] - 1s     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.63244026380538942, 0.8455999998664856]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.evaluate(X_test, y_test, batch_size=128)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8868"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test_pred = model.predict(X_test)\n",
    "# Threshold into 0/1 labels while keeping the raw probabilities in y_test_pred.\n",
    "y_test_label = (y_test_pred >= 0.5).astype(int)\n",
    "# Flatten the predictions so they compare element-wise with y_test.\n",
    "np.count_nonzero(y_test_label.ravel() == y_test) / len(y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As the manual test cases below show, the model does much better on our example sentences."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.08907689]], dtype=float32)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode the sentence with the TF-IDF vectorizer, since this model was trained on TF-IDF features.\n",
    "test_case = tfidf_vectorizer.transform([\"I really hated this movie\"])\n",
    "model.predict(test_case.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.99189383]], dtype=float32)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode the sentence with the TF-IDF vectorizer, since this model was trained on TF-IDF features.\n",
    "test_case = tfidf_vectorizer.transform([\"I really loved this movie\"])\n",
    "model.predict(test_case.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAD8CAYAAACLrvgBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFm1JREFUeJzt3X+s3fV93/Hna9Ai1A7KD48SQ2YinE3AWndYDlKTLh0t\nOGEqpILEqAquhiARLGu0ThMsfxARIcHWBI1loSLB4ocSfgySYimwzIGq0dTa4RKh8CMhXIIjbDng\n2B50a2Exee+P87no+HLt++Gee+/xtZ8P6avzPe/v9/M9n/O14eXv9/M556SqkCSpxz8YdwckSUuH\noSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduR4+7AfDvxxBNrxYoV4+6GJC0p\nTzzxxM+qatls+x1yobFixQomJibG3Q1JWlKS/KRnP29PSZK6GRqSpG6GhiSpm6EhSepmaEiSuhka\nkqRuhoYkqZuhIUnqZmhIkrodcp8Ilw4GK6755lvrW2+8YIw9keaXoSHNk+GgkA5Vhoa0wLzq0KFk\n1jGNJBuSvJLk6aHafUmebMvWJE+2+ookfz+07c+H2pyd5Kkkk0luSZJWP6odbzLJliQrhtqsT/J8\nW9bP5xuXJL1zPVcadwBfBO6aKlTVx6bWk3weeHVo/xeqatUMx7kVuALYAjwMrAUeAS4H9lTV6UnW\nATcBH0tyPHAdsBoo4IkkG6tqT//bkyTNp1lDo6q+M/yv/2HtauGjwL880DGSnAwcU1Wb2/O7gIsY\nhMaFwGfbrg8AX2zHPR/YVFW7W5tNDILmntn6LB2svFWlpW7UKbcfAF6uqueHaqe1W1N/leQDrbYc\n2Da0z7ZWm9r2EkBV7WVw1XLCcH2GNvtIcmWSiSQTO3fuHPEtSZL2Z9SB8EvZ91/+O4B3V9WuJGcD\nf5HkzBFfY1ZVdRtwG8Dq1atroV9PmuKMKR1u5nylkeRI4A+B+6ZqVfVGVe1q608ALwDvBbYDpww1\nP6XVaI+nDh3zWGDXcH2GNpKkMRjl9tTvAT+sqrduOyVZluSItv4eYCXw46raAbyW5Jw2XnEZ8FBr\nthGYmhl1MfBYVRXwLeC8JMclOQ44r9UkSWMy6+2pJPcAHwROTLINuK6qbgfW8fZB6d8Brk/yc+AX\nwCenBrKBqxjMxDqawQD4I61+O3B3kklgdzsuVbU7yeeAx9t+1w8dS5I0Bj2zpy7dT/2PZ6g9CDy4\nn/0ngLNmqL8OXLKfNhuADbP1UZK0OPzCQklSN0NDktTN0JAkdfMLC6V3yM9m6HDmlYYkqZuhIUnq\n5u0paUz88kItRV5pSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKk\nboaGJKmboSFJ6jZraCTZkOSVJE8P1T6bZHuSJ9vy4aFt1yaZTPJckvOH6mcneaptuyVJWv2oJPe1\n+pYkK4barE/yfFvWz9ebliTNTc+33N4BfBG4a1r95qr6s+FCkjOAdcCZwLuAbyd5b1W9CdwKXAFs\nAR4G1gKPAJcDe6rq9CTrgJuAjyU5HrgOWA0U8ESSjVW1Z07vVBqBP7wkDcx6pVFV3wF2dx7vQuDe\nqnqjql4EJoE1SU4GjqmqzVVVDALooqE2d7b1B4Bz21XI+cCmqtrdgmITg6CRJI3JKGMan0ry/Xb7\n6rhWWw68NLTPtlZb3tan1/dpU1V7gVeBEw5wLEnSmMw1NG4F3gOsAnYAn5+3Hs1BkiuTTCSZ2Llz\n5zi7Is3Jimu++dYiHczmFBpV9XJVvVlVvwC+DKxpm7YDpw7tekqrbW/r0+v7tElyJHAssOsAx5qp\nP7dV1eqqWr1s2bK5vCVJUoc5hUYbo5jyEWBqZtVGYF2bEXUasBL4blXtAF5Lck4br7gMeGiozdTM\nqIuBx9q4x7eA85Ic125/nddqkqQxmX
X2VJJ7gA8CJybZxmBG0weTrGIwq2kr8AmAqnomyf3As8Be\n4Oo2cwrgKgYzsY5mMGvqkVa/Hbg7ySSDAfd17Vi7k3wOeLztd31V9Q7IS5IWwKyhUVWXzlC+/QD7\n3wDcMEN9AjhrhvrrwCX7OdYGYMNsfZQkLQ4/ES5J6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhka\nkqRuhoYkqZuhIUnqZmhIkroZGpKkbj0/9yodlvxtC+ntvNKQJHUzNCRJ3bw9JR1khm+Lbb3xgjH2\nRHo7rzQkSd0MDUlSN0NDktTN0JAkdTM0JEndZg2NJBuSvJLk6aHaf07ywyTfT/KNJL/W6iuS/H2S\nJ9vy50Ntzk7yVJLJJLckSasfleS+Vt+SZMVQm/VJnm/L+vl845Kkd67nSuMOYO202ibgrKr6DeBH\nwLVD216oqlVt+eRQ/VbgCmBlW6aOeTmwp6pOB24GbgJIcjxwHfA+YA1wXZLj3sF7kyTNs1lDo6q+\nA+yeVvufVbW3Pd0MnHKgYyQ5GTimqjZXVQF3ARe1zRcCd7b1B4Bz21XI+cCmqtpdVXsYBNX08JIk\nLaL5GNP418AjQ89Pa7em/irJB1ptObBtaJ9trTa17SWAFkSvAicM12dos48kVyaZSDKxc+fOUd+P\nJGk/RgqNJJ8B9gJfbaUdwLurahXw74CvJTlmtC7Orqpuq6rVVbV62bJlC/1yknTYmnNoJPlj4F8B\nf9RuOVFVb1TVrrb+BPAC8F5gO/vewjql1WiPp7ZjHgkcC+wars/QRpI0BnMKjSRrgf8A/EFV/d1Q\nfVmSI9r6exgMeP+4qnYAryU5p41XXAY81JptBKZmRl0MPNZC6FvAeUmOawPg57WaJGlMZv3CwiT3\nAB8ETkyyjcGMpmuBo4BNbebs5jZT6neA65P8HPgF8MmqmhpEv4rBTKyjGYyBTI2D3A7cnWSSwYD7\nOoCq2p3kc8Djbb/rh44lSRqDWUOjqi6doXz7fvZ9EHhwP9smgLNmqL8OXLKfNhuADbP1UZK0OPxE\nuCSpm6EhSepmaEiSuvnLfdKQ4V/Nk/R2XmlIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSp\nm5/TkA5iw58b2XrjBWPsiTTglYYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6zRoaSTYk\neSXJ00O145NsSvJ8ezxuaNu1SSaTPJfk/KH62UmeattuSZJWPyrJfa2+JcmKoTbr22s8n2T9fL1p\nSdLc9Fxp3AGsnVa7Bni0qlYCj7bnJDkDWAec2dp8KckRrc2twBXAyrZMHfNyYE9VnQ7cDNzUjnU8\ncB3wPmANcN1wOEmSFt+soVFV3wF2TytfCNzZ1u8ELhqq31tVb1TVi8AksCbJycAxVbW5qgq4a1qb\nqWM9AJzbrkLOBzZV1e6q2gNs4u3hJUlaRHMd0zipqna09Z8CJ7X15cBLQ/tta7XlbX16fZ82VbUX\neBU44QDHkiSNycgD4e3KoeahL3OW5MokE0kmdu7cOc6uSNIhba6h8XK75UR7fKXVtwOnDu13Sqtt\nb+vT6/u0SXIkcCyw6wDHepuquq2qVlfV6mXLls3xLUmSZjPX0NgITM1mWg88NFRf12ZEncZgwPu7\n7VbWa0nOaeMVl01rM3Wsi4HH2tXLt4DzkhzXBsDPazVJ0pjM+tXoSe4BPgicmGQbgxlNNwL3J7kc\n+AnwUYCqeibJ/cCzwF7g6qp6sx3qKgYzsY4GHmkLwO3A3UkmGQy4r2vH2p3kc8Djbb/rq2r6gLwk\naRHNGhpVdel+Np27n/1vAG6YoT4BnDVD/XXgkv0cawOwYbY+SpIWh58IlyR1MzQkSd38uVcd1oZ/\nTlXS7LzSkCR1MzQkSd0MDUlSN0NDktTN0JAkdXP2lLREDM/02nrjBWPsiQ5nXmlIkroZGpKkboaG\nJK
mboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqducQyPJP0ny5NDyWpJPJ/ls\nku1D9Q8Ptbk2yWSS55KcP1Q/O8lTbdstSdLqRyW5r9W3JFkxypuVJI1mzqFRVc9V1aqqWgWcDfwd\n8I22+eapbVX1MECSM4B1wJnAWuBLSY5o+98KXAGsbMvaVr8c2FNVpwM3AzfNtb+SpNHN1+2pc4EX\nquonB9jnQuDeqnqjql4EJoE1SU4GjqmqzVVVwF3ARUNt7mzrDwDnTl2FSJIW33yFxjrgnqHnn0ry\n/SQbkhzXasuBl4b22dZqy9v69Po+bapqL/AqcMI89VmS9A6NHBpJfhn4A+C/t9KtwHuAVcAO4POj\nvkZHH65MMpFkYufOnQv9cpJ02JqPK40PAd+rqpcBqurlqnqzqn4BfBlY0/bbDpw61O6UVtve1qfX\n92mT5EjgWGDX9A5U1W1VtbqqVi9btmwe3pIkaSbz8ct9lzJ0ayrJyVW1oz39CPB0W98IfC3JF4B3\nMRjw/m5VvdlmXp0DbAEuA/7rUJv1wN8AFwOPtXEPac6GfwFP0jszUmgk+RXg94FPDJX/U5JVQAFb\np7ZV1TNJ7geeBfYCV1fVm63NVcAdwNHAI20BuB24O8kksJvB2IkkaUxGCo2q+r9MG5iuqo8fYP8b\ngBtmqE8AZ81Qfx24ZJQ+SpLmj58IlyR1MzQkSd0MDUlSt/mYPSVpkQ3PANt64wVj7IkON15pSJK6\nGRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6\nGRqSpG6GhiSpm7+nocPC8O9PSJq7ka40kmxN8lSSJ5NMtNrxSTYleb49Hje0/7VJJpM8l+T8ofrZ\n7TiTSW5JklY/Ksl9rb4lyYpR+itJGs183J763apaVVWr2/NrgEeraiXwaHtOkjOAdcCZwFrgS0mO\naG1uBa4AVrZlbatfDuypqtOBm4Gb5qG/kqQ5WogxjQuBO9v6ncBFQ/V7q+qNqnoRmATWJDkZOKaq\nNldVAXdNazN1rAeAc6euQiRJi2/U0Cjg20meSHJlq51UVTva+k+Bk9r6cuClobbbWm15W59e36dN\nVe0FXgVOmN6JJFcmmUgysXPnzhHfkiRpf0YdCH9/VW1P8o+ATUl+OLyxqipJjfgas6qq24DbAFav\nXr3grycdTIYH+bfeeMEYe6LDwUhXGlW1vT2+AnwDWAO83G450R5fabtvB04dan5Kq21v69Pr+7RJ\nciRwLLBrlD5LkuZuzqGR5FeS/MOpdeA84GlgI7C+7bYeeKitbwTWtRlRpzEY8P5uu5X1WpJz2njF\nZdPaTB3rYuCxNu4hSRqDUW5PnQR8o41LHwl8rar+R5LHgfuTXA78BPgoQFU9k+R+4FlgL3B1Vb3Z\njnUVcAdwNPBIWwBuB+5OMgnsZjD7SpI0JnMOjar6MfCbM9R3Aefup80NwA0z1CeAs2aovw5cMtc+\nSpLml18jIknqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSermz73qkOVPvErz\nzysNSVI3rzSkQ4i/raGF5pWGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus05NJKcmuQv\nkzyb5Jkkf9Lqn02yPcmTbfnwUJtrk0wmeS7J+UP1s5M81bbdkiStflSS+1p9S5IVc3+rkqRRjXKl\nsRf406o6AzgHuDrJGW3bzVW1qi0PA7Rt64AzgbXAl5Ic0fa/FbgCWNmWta1+ObCnqk4HbgZuGqG/\nkqQRzTk0qmpHVX2vrf8t8ANg+QGaXAjcW1VvVNWLwCSwJsnJwDFVtbmqCrgLuGiozZ1t/QHg3Kmr\nEEnS4puXrxFpt41+C9gC/DbwqSSXARMMrkb2MAiUzUPNtrXaz9v6
9Drt8SWAqtqb5FXgBOBn89Fv\nHXr8kkJpYY08EJ7kV4EHgU9X1WsMbjW9B1gF7AA+P+prdPThyiQTSSZ27ty50C8nLQkrrvnmW4s0\nX0YKjSS/xCAwvlpVXweoqper6s2q+gXwZWBN2307cOpQ81NabXtbn17fp02SI4FjgV3T+1FVt1XV\n6qpavWzZslHekiTpAEaZPRXgduAHVfWFofrJQ7t9BHi6rW8E1rUZUacxGPD+blXtAF5Lck475mXA\nQ0Nt1rf1i4HH2riHJGkMRhnT+G3g48BTSZ5stf8IXJpkFVDAVuATAFX1TJL7gWcZzLy6uqrebO2u\nAu4AjgYeaQsMQunuJJPAbgazryRJYzLn0Kiq/wXMNJPp4QO0uQG4YYb6BHDWDPXXgUvm2kdJ0vzy\nE+GSpG6GhiSpm6EhSepmaEiSus3LJ8KlcfLDa7MbPkdbb7xgjD3RUueVhiSpm6EhSepmaEiSuhka\nkqRuhoYkqZuzp6TDjDOpNApDQ0uS02yl8fD2lCSpm6EhSerm7SnpMOb4ht4pQ0NLhuMY0vh5e0qS\n1M0rDUmAt6rUx9DQQc1bUtLBxdCQ9DbTw9orD01ZEqGRZC3wX4AjgK9U1Y1j7pIWkFcXBx9vXWnK\nQR8aSY4A/hvw+8A24PEkG6vq2fH2TKMyHJam/f25GSaHh4M+NIA1wGRV/Rggyb3AhYChsUQYDocH\nw+TwsBRCYznw0tDzbcD7xtSXw4r/s9d8WOi/R4bS4loKoTGrJFcCV7an/yfJc2PszonAz8b4+gcD\nz4HnABbpHOSmhX6FkSylvwf/uGenpRAa24FTh56f0mpvqarbgNsWs1P7k2SiqlaPux/j5DnwHIDn\nAA7Nc7AUPhH+OLAyyWlJfhlYB2wcc58k6bB00F9pVNXeJP8G+BaDKbcbquqZMXdLkg5LB31oAFTV\nw8DD4+5Hp4PiNtmYeQ48B+A5gEPwHKSqxt0HSdISsRTGNCRJBwlDY0RJjk+yKcnz7fG4GfY5Nclf\nJnk2yTNJ/mQcfV0oPeeg7bchyStJnl7sPi6EJGuTPJdkMsk1M2xPklva9u8n+efj6OdC6jgH/zTJ\n3yR5I8m/H0cfF1rHOfij9uf/VJK/TvKb4+jnfDE0RncN8GhVrQQebc+n2wv8aVWdAZwDXJ3kjEXs\n40LrOQcAdwBrF6tTC2no620+BJwBXDrDn+mHgJVtuRK4dVE7ucA6z8Fu4N8Cf7bI3VsUnefgReBf\nVNU/Az7HEh/nMDRGdyFwZ1u/E7ho+g5VtaOqvtfW/xb4AYNPuh8qZj0HAFX1HQb/EzkUvPX1NlX1\n/4Cpr7cZdiFwVw1sBn4tycmL3dEFNOs5qKpXqupx4Ofj6OAi6DkHf11Ve9rTzQw+a7ZkGRqjO6mq\ndrT1nwInHWjnJCuA3wK2LGy3FtU7OgeHiJm+3mb6PwR69lnKDvX31+OdnoPLgUcWtEcLbElMuR23\nJN8Gfn2GTZ8ZflJVlWS/09GS/CrwIPDpqnptfnu5sObrHEiHqyS/yyA03j/uvozC0OhQVb+3v21J\nXk5yclXtaLceXtnPfr/EIDC+WlVfX6CuLpj5OAeHmFm/3qZzn6XsUH9/PbrOQZLfAL4CfKiqdi1S\n3xaEt6dGtxFY39bXAw9N3yFJgNuBH1TVFxaxb4tl1nNwCOr5epuNwGVtFtU5wKtDt/EOBX7FT8c5\nSPJu4OvAx6vqR2Po4/yqKpcRFuAEBjOGnge+DRzf6u8CHm7r7wcK+D7wZFs+PO6+L+Y5aM/vAXYw\nGBTdBlw+7r6P+L4/DPwIeAH4TKt9EvhkWw+DmTUvAE8Bq8fd5zGcg19vf9avAf+7rR8z7n4v8jn4\nCrBn6L/9iXH3eZTFT4RLkrp5e0qS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUrf/\nD73D/oKlUOzqAAAAAElFTkSu
QmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fcd7dd1bfd0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the model's first weight array, flattened to 1-D.\n",
    "fig, ax = plt.subplots()\n",
    "ax.hist(model.get_weights()[0].ravel(), bins=100)\n",
    "ax.set(title='Distribution of first-layer weights', xlabel='Weight value', ylabel='Count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<tf.Variable 'dense_3/kernel:0' shape=(26967, 100) dtype=float32_ref>,\n",
       " <tf.Variable 'dense_3/bias:0' shape=(100,) dtype=float32_ref>,\n",
       " <tf.Variable 'dense_4/kernel:0' shape=(100, 1) dtype=float32_ref>,\n",
       " <tf.Variable 'dense_4/bias:0' shape=(1,) dtype=float32_ref>]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "latex_envs": {
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 0
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
